# rm(list=ls())
# create class
setClass("waldCI",
slots = c(level = "numeric",
mean = "numeric",
sterr = "numeric",
lb = "numeric",
ub = "numeric"))PS4
Problem 1
Create a class to represent Wald-style normal approximation Confidence Intervals. Do this using S4.
- For the waldCI class, define the following:
- A constructor, which takes in a confidence level (0,1) and either a mean and standard error, or a lower and upper bound. This should be a custom constructor, not new() or waldCI().
- A validator.
For this section, I created a validator first so that I can use it in the constructor.
# create validator for use in constructor
setValidity("waldCI", function(object) {
# needed for part c
# negative standard error
if (object@sterr <= 0) {
stop(paste("@sterr = ", object@sterr, " cannot be negative"))
}
# lb > ub
if (object@lb >= object@ub) {
stop(paste("@lb = ", object@lb, " cannot be larger than @ub = ", object@ub))
}
# infinite bounds
if (!is.finite(object@lb) || !is.finite(object@ub)) {
stop(paste("Bounds must be finite"))
}
return(TRUE)
})Class "waldCI" [in ".GlobalEnv"]
Slots:
Name: level mean sterr lb ub
Class: numeric numeric numeric numeric numeric
# constructor
waldCI <- function(level, mean = NULL, sterr = NULL, lb = NULL, ub = NULL) {
# check level
if (missing(level) || level <= 0 || level >= 1) {
stop(paste("Confidence level must be a value between (0,1)"))
}
# fill in empty slots
alpha <- 1 - level
z <- qnorm(1 - alpha / 2)
if (!is.null(mean) && !is.null(sterr)) {
lb <- mean - z * sterr
ub <- mean + z * sterr
} else if (!is.null(lb) && !is.null(ub)) {
mean <- (ub + lb) / 2
sterr <- (ub - lb) / (2 * z)
} else {
stop("You must provide either (mean and se) or (lower and upper).")
}
object <- new("waldCI", level = level, mean = mean, sterr = sterr, lb = lb, ub = ub)
validObject(object)
return(object)
}- A show method.
setMethod("show", "waldCI",
function(object) {
cat("Wald Confidence Interval\n")
cat(" Confidence level: ", object@level, "\n")
cat(" Mean: ", object@mean, "\n")
cat(" Standard error: ", object@sterr, "\n")
cat(" CI: [", object@lb, ", ", object@ub, "]\n")
return(invisible(object))
}
)- Accessors: lb, ub, mean, sterr, level.
setGeneric("lb", function(object, ...) standardGeneric("lb"))[1] "lb"
setMethod("lb", "waldCI", function(object, ...) object@lb)
setGeneric("ub", function(object, ...) standardGeneric("ub"))[1] "ub"
setMethod("ub", "waldCI", function(object, ...) object@ub)
setGeneric("mean", function(object, ...) standardGeneric("mean"))Creating a new generic function for 'mean' in the global environment
[1] "mean"
setMethod("mean", "waldCI", function(object, ...) object@mean)
setGeneric("sterr", function(object, ...) standardGeneric("sterr"))[1] "sterr"
setMethod("sterr", "waldCI", function(object, ...) object@sterr)
setGeneric("level", function(object, ...) standardGeneric("level"))[1] "level"
setMethod("level", "waldCI", function(object, ...) object@level)- Setters: lb, ub, mean, sterr, level. Be sure to validate the resulting waldCI.
setGeneric("lb<-",
function(object, value, ...) {
standardGeneric("lb<-")
})[1] "lb<-"
setMethod("lb<-", "waldCI",
function(object, value, ...) {
object@lb <- value
validObject(object)
return(object)
}
)
setGeneric("ub<-",
function(object, value, ...) {
standardGeneric("ub<-")
})[1] "ub<-"
setMethod("ub<-", "waldCI",
function(object, value, ...) {
object@ub <- value
validObject(object)
return(object)
}
)
setGeneric("mean<-",
function(object, value, ...) {
standardGeneric("mean<-")
})[1] "mean<-"
setMethod("mean<-", "waldCI",
function(object, value, ...) {
object@mean <- value
validObject(object)
return(object)
}
)
setGeneric("sterr<-",
function(object, value, ...) {
standardGeneric("sterr<-")
})[1] "sterr<-"
setMethod("sterr<-", "waldCI",
function(object, value, ...) {
object@sterr <- value
validObject(object)
return(object)
}
)
setGeneric("level<-",
function(object, value, ...) {
standardGeneric("level<-")
})[1] "level<-"
setMethod("level<-", "waldCI",
function(object, value, ...) {
object@level <- value
validObject(object)
return(object)
}
)- A contains method, returning a logical of whether a value is within a CI.
setGeneric("contains", function(object, value, ...) standardGeneric("contains"))[1] "contains"
setMethod("contains", "waldCI",
function(object, value, ...) {
return(value >= object@lb && value <= object@ub)
})- An overlap method, that takes in two waldCI’s, and returns a logical of whether the two confidence intervals overlap.
setGeneric("overlap", function(ci1, ci2, ...) standardGeneric("overlap"))[1] "overlap"
setMethod("overlap", signature(ci1 = "waldCI", ci2 = "waldCI"),
function(ci1, ci2, ...) {
return(max(ci1@lb, ci2@lb) <= min(ci1@ub, ci2@ub))
})- as.numeric to return c(lb, ub). (Hint: The second argument of setGeneric is not needed when an existing s3 function uses the .Primitive function.)
setMethod("as.numeric", "waldCI", function(x, ...) {
c(x@lb, x@ub)
})- transformCI which takes in a function and a waldCI, and returns the transformed waldCI object. Warn the user that only monotonic functions make sense.
setGeneric("transformCI", function(object, foo, ...) standardGeneric("transformCI"))[1] "transformCI"
setMethod("transformCI", signature(object = "waldCI", foo = "function"),
function(object, foo, ...) {
warning("Warning: Only monotonic functions make sense for transformations")
new_lb <- foo(object@lb)
new_ub <- foo(object@ub)
waldCI(level = object@level,
lb = new_lb,
ub = new_ub)
})- Use your waldCI class to create three objects:
ci1: (17.2, 24.7), 95%
ci2: mean: 13, standard error: 2.5, 99%
ci3: (27.43, 39.22), 75%
ci1 <- waldCI(level = 0.95, lb = 17.2, ub = 24.7)
show(ci1)Wald Confidence Interval
Confidence level: 0.95
Mean: 20.95
Standard error: 1.9133
CI: [ 17.2 , 24.7 ]
ci2 <- waldCI(level = 0.99, mean = 13, sterr = 2.5)
show(ci2)Wald Confidence Interval
Confidence level: 0.99
Mean: 13
Standard error: 2.5
CI: [ 6.560427 , 19.43957 ]
ci3 <- waldCI(level = 0.75, lb = 27.43, ub = 39.22)
show(ci3)Wald Confidence Interval
Confidence level: 0.75
Mean: 33.325
Standard error: 5.12453
CI: [ 27.43 , 39.22 ]
Evaluate the following code:
ci1Wald Confidence Interval
Confidence level: 0.95
Mean: 20.95
Standard error: 1.9133
CI: [ 17.2 , 24.7 ]
ci2Wald Confidence Interval
Confidence level: 0.99
Mean: 13
Standard error: 2.5
CI: [ 6.560427 , 19.43957 ]
ci3Wald Confidence Interval
Confidence level: 0.75
Mean: 33.325
Standard error: 5.12453
CI: [ 27.43 , 39.22 ]
as.numeric(ci1)[1] 17.2 24.7
as.numeric(ci2)[1] 6.560427 19.439573
as.numeric(ci3)[1] 27.43 39.22
lb(ci2)[1] 6.560427
ub(ci2)[1] 19.43957
mean(ci1)[1] 20.95
sterr(ci3)[1] 5.12453
level(ci2)[1] 0.99
lb(ci2) <- 10.5
mean(ci3) <- 34
level(ci3) <- .8
contains(ci1, 17)[1] FALSE
contains(ci3, 44)[1] FALSE
overlap(ci1, ci2)[1] TRUE
eci1 <- transformCI(ci1, sqrt)Warning in transformCI(ci1, sqrt): Warning: Only monotonic functions make sense
for transformations
eci1Wald Confidence Interval
Confidence level: 0.95
Mean: 4.558599
Standard error: 0.2098562
CI: [ 4.147288 , 4.969909 ]
mean(transformCI(ci2, log))Warning in transformCI(ci2, log): Warning: Only monotonic functions make sense
for transformations
[1] 2.659343
- Show that your validator does not allow the creation of invalid confidence intervals:
negative standard error
lb > ub
Infinite bounds
invalid use of the setters
# negative sterr
try({waldCI(level = 0.95, mean = 10, sterr = -2)})Error in validityMethod(object) : @sterr = -2 cannot be negative
# lb > ub
try({waldCI(level = 0.95, lb = 10, ub = 5)})Error in validityMethod(object) :
@sterr = -1.27553364231164 cannot be negative
# infinite bounds
try({waldCI(level = 0.95, lb = -Inf, ub = Inf)})Error in validityMethod(object) : Bounds must be finite
# invalid use of setters
try({
ci <- waldCI(level = 0.95, mean = 10, sterr = 2)
lb(ci) <- 15 # This should trigger validation failure if ub < lb now
ub(ci) <- 5
validObject(ci)
})Error in validityMethod(object) :
@lb = 15 cannot be larger than @ub = 13.9199279690801
Problem 3
First, I load the ggplot2 library and the data. Currently, the “date” variable is a character variable, so I change the it to be of type Date.
# install.packages("plotly")
library(plotly)Warning: package 'plotly' was built under R version 4.4.3
Loading required package: ggplot2
Warning: package 'ggplot2' was built under R version 4.4.3
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
library(dplyr)
Attaching package: 'dplyr'
The following object is masked _by_ '.GlobalEnv':
contains
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(lubridate)
Attaching package: 'lubridate'
The following objects are masked from 'package:base':
date, intersect, setdiff, union
covid <- read.csv("data/us-states.csv")
covid$date <- as.Date(covid$date)- How many major and minor spikes in cases were there?
One additional thing I did here was to filter the total cases to get rid of negative cases. To me, it doesn’t make sense to have negative cases.
covid_date <- covid %>%
group_by(date) %>%
summarise(total_cases = sum(cases)) %>%
filter(total_cases > 0)
covid_week <- covid %>%
mutate(week = floor_date(date, "week")) %>%
group_by(week) %>%
summarise(cases_week_avg = sum(cases) / 7)
p <- plot_ly() %>%
add_trace(
data = covid_date,
x = ~date, y = ~total_cases,
type = "scatter", mode = "markers",
marker = list(
color = ~sqrt(total_cases),
colorscale = "Reds",
showscale = FALSE
),
name = "Daily Cases") %>%
add_trace(
data = covid_week,
x = ~week, y = ~cases_week_avg,
type = "scatter", mode = "lines",
line = list(width = 3),
name = "Weekly Average"
) %>%
layout(
title = "Major and Minor COVID-19 Case Spikes in the U.S.",
xaxis = list(title = "Date",
range = c(min(covid_date$date), max(covid_date$date))),
yaxis = list(title = "Total Daily New Cases")
)
# add interactivity for fun
p <- p %>% layout(
updatemenus = list(
list(
type = "buttons",
x = -0.1,
buttons = list(
list(
label = "Daily",
method = "restyle",
args = list(
list(visible = list(TRUE, FALSE),
"marker.opacity" = list(0.8, NA))
)
),
list(
label = "Weekly",
method = "restyle",
args = list("visible", list(FALSE, TRUE))
),
list(
label = "Both",
method = "restyle",
args = list(
list(visible = list(TRUE, TRUE),
"marker.opacity" = list(0.4, NA))
)
)
))
)
)
p Based on the graph, I can estimate one major spike, and around 3-4 minor spikes.
- For the states with the highest and lowest overall rates per population, what differences do you see in their trajectories over time?
I was inspired by the solution for PS4 for this updated solution. First, let’s visualize the states with the highest and lowest overall rates per population. I chose to use a bar graph.
covid_state <- covid %>%
group_by(state) %>%
summarize(total = sum(cases_avg_per_100k)) %>%
ungroup()
p <- plot_ly() %>%
add_trace(
data = covid_state,
y = ~state, x = ~total,
type = "bar",
marker = list(
color = ~total,
colorscale = "Portland",
showscale = FALSE),
name = "Daily Cases",
text = ~paste(
"State:", state,
"<br>Total Cases per 100k:", round(total, 2)),
textposition = "none",
hoverinfo = "text") %>%
layout(
title = "Overall COVID Rates per Population by State",
xaxis = list(title = "Total Covid Cases per 100k"),
yaxis = list(title = "State"),
height = 900
)Warning: Specifying width/height in layout() is now deprecated.
Please specify in ggplotly() or plot_ly()
p From the plot above, we can see that the states with the highest overall rates are Rhode Island and Alaska. The states with the lowest overall rates are Maine and Maryland. Hence, I will look more closely at these 5 states in the next plot.
states <- c("Rhode Island", "Alaska", "Maine", "Maryland")
p <- plot_ly(covid %>% filter(state %in% states),
x = ~date, y = ~cases_avg_per_100k,
color = ~state,
colors = c(
"Rhode Island" = "firebrick",
"Alaska" = "darksalmon",
"Maine" = "royalblue",
"Maryland" = "skyblue"
),
type = "scatter", mode = "lines",
line = list(width = 2),
text = ~paste(
"State:", state,
"<br>Date:", date,
"<br>Cases per 100k:", round(cases_avg_per_100k, 2)),
hoverinfo = "text") %>%
layout(
title = list(text = "COVID-19 Case Rates by State (Highest & Lowest Overall Rates)"),
xaxis = list(title = "Date"),
yaxis = list(title = "Average Cases Per 100k Residents"),
legend = list(title = list(text = "State"))
)
p From the plot above, we can see states with the highest overall rates (RI and Alaska) show much more prominent spikes as compared to the states with the lowest overall rates (Maine and Maryland).
- Identify, to the best of your ability without a formal test, the first five states to experience Covid in a substantial way.
covid_start <- covid %>%
filter(date > as.Date("2020-03-20"),
date < as.Date("2020-04-20"),
cases_avg_per_100k > 15)
spectral <- grDevices::hcl.colors(length(unique(covid_start$state)), "Spectral")
p <- plot_ly(covid_start,
x = ~date, y = ~cases_avg_per_100k,
color = ~state, colors = spectral,
type = "scatter", mode = "lines",
line = list(width = 2),
text = ~paste(
"State:", state,
"<br>Date:", date,
"<br>Cases per 100k:", round(cases_avg_per_100k, 2)),
hoverinfo = "text") %>%
layout(
title = list(text = "COVID-19 Case Rates by State (Mar–Apr 2020)"),
xaxis = list(title = "Date"),
yaxis = list(title = "Average Cases Per 100k Residents"),
legend = list(title = list(text = "State"))
)
p By restricting the plot to show the first few months of COVID, and by filtering such that only states above a certain rate are shown, we can see that the first four states to experience COVID in a substantial way are New York, then New Jersey, then Louisiana, then Guam. The fifth state can be debated to be between Connecticut and Massachusetts.
Attribution
Referred to PS4 solutions to refine my plots for PS5 Q3
Plotly R documentation: https://plotly.com/r/